package com.dcivision.lucene.extractor;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.InputStreamReader;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import com.dcivision.framework.ApplicationException;
import com.dcivision.framework.SystemParameterConstant;
import com.dcivision.framework.SystemParameterFactory;

/**
 * {@link DocumentHandler} that extracts the text of a PDF document by running
 * the external xpdf <code>pdftotext.exe</code> tool.
 *
 * <p>The incoming stream is first copied to a temporary ".pdf" file under the
 * folder configured by
 * <code>SystemParameterConstant.DMS_EMAILARCHIVE_CACHEFOLDER_PATH</code>; the tool
 * is then run against that file and its standard output is collected. The
 * temporary file is removed before the method returns, whether or not the
 * extraction succeeded.</p>
 */
public class XPDFHandler implements DocumentHandler {

  private static final Log log = LogFactory.getLog(XPDFHandler.class);

  public XPDFHandler() {

  }

  /**
   * Returns the full text of the PDF document read from the given stream.
   *
   * <p>The stream is read to its end but is not closed here.</p>
   *
   * @param is stream providing the PDF bytes; may be <code>null</code>
   * @return the text written by pdftotext to its standard output, or
   *         <code>null</code> when <code>is</code> is <code>null</code>, when the
   *         temporary file could not be written, or when running the tool failed
   *         (failures are logged, not thrown)
   * @throws ApplicationException declared by the handler contract; this
   *         implementation does not throw it itself
   */
  public String getDocumentFullText(InputStream is) throws ApplicationException {
    if (is == null) {
      return null;
    }

    // The cache folder value is concatenated as-is, so it is presumably expected
    // to end with a path separator -- NOTE(review): confirm against configuration.
    String cacheFolder = SystemParameterFactory.getSystemParameter(SystemParameterConstant.DMS_EMAILARCHIVE_CACHEFOLDER_PATH);
    String cacheFileName = cacheFolder + "" + new java.util.Date().getTime() + "_" + (Math.random() * 1000) + ".pdf";

    try {
      // step 1: save pdf file to cache folder
      if (!saveToCacheFile(is, cacheFileName)) {
        // file save failure, cancel extractor action.
        return null;
      }
      // step 2: get full text from file by executing a command.
      return extractText(cacheFileName);
    } finally {
      // step 3: delete cache pdf file -- done in finally so that failed
      // extractions do not leave files behind in the cache folder.
      deleteCacheFile(cacheFileName);
    }
  }

  /**
   * Copies the whole stream into the given file.
   *
   * @return <code>true</code> when the copy completed, <code>false</code> when an
   *         error occurred (the error is logged)
   */
  private static boolean saveToCacheFile(InputStream is, String cacheFileName) {
    FileOutputStream out = null;
    try {
      out = new FileOutputStream(cacheFileName);
      byte[] buffer = new byte[8192];
      int length;
      while ((length = is.read(buffer)) != -1) {
        out.write(buffer, 0, length);
      }
      return true;
    } catch (Exception ex) {
      log.error(ex, ex);
      return false;
    } finally {
      // The file must be closed before the external tool opens it.
      if (out != null) {
        try {
          out.close();
        } catch (Exception ignored) {
          // best effort: nothing useful can be done if close fails
        }
      }
    }
  }

  /**
   * Runs pdftotext on the given file and returns everything it wrote to its
   * standard output, decoded as UTF-8.
   *
   * @return the extracted text, or <code>null</code> when the tool could not be
   *         started or its output could not be read (the error is logged)
   */
  private static String extractText(String cacheFileName) {
    String xpdfPath = SystemParameterFactory.getSystemParameter(SystemParameterConstant.DMS_INDEX_EXTRACTOR_XPDF_PATH);
    if (xpdfPath == null || xpdfPath.equals("") || xpdfPath.equals("null")) {
      // fall back to the default install location when nothing is configured
      xpdfPath = "C:\\xpdf";
    }

    String xpdfExePath = xpdfPath + "\\pdftotext.exe";
    // "-" as output file makes pdftotext write the text to stdout.
    // NOTE(review): "xpdfrc" is a relative path, so it is resolved against the
    // working directory of this JVM; the UTF-8 decoding below presumably relies
    // on that config selecting UTF-8 output -- confirm.
    String[] cmd = new String[] { xpdfExePath, "-cfg", "xpdfrc", "-q", cacheFileName, "-" };

    Process process = null;
    InputStreamReader reader = null;
    try {
      StringBuffer text = new StringBuffer();

      process = Runtime.getRuntime().exec(cmd);
      reader = new InputStreamReader(new BufferedInputStream(process.getInputStream()), "UTF-8");

      char[] buf = new char[8192];
      int length;
      while ((length = reader.read(buf)) != -1) {
        text.append(buf, 0, length);
      }

      // Wait for the tool to exit so it no longer holds the cache file open
      // when the caller deletes it.
      try {
        int exitCode = process.waitFor();
        if (exitCode != 0) {
          log.warn("pdftotext exited with code " + exitCode + " for " + cacheFileName);
        }
      } catch (InterruptedException ex) {
        // Output was already read to EOF; keep the result and restore the flag.
        Thread.currentThread().interrupt();
      }

      return text.toString();
    } catch (Exception ex) {
      log.error(ex, ex);
      return null;
    } finally {
      if (reader != null) {
        try {
          reader.close();
        } catch (Exception ignored) {
          // best effort
        }
      }
      if (process != null) {
        // Release the remaining pipe handles and make sure the child is gone.
        try {
          process.getErrorStream().close();
        } catch (Exception ignored) {
          // best effort
        }
        try {
          process.getOutputStream().close();
        } catch (Exception ignored) {
          // best effort
        }
        process.destroy();
      }
    }
  }

  /** Removes the temporary PDF file if it exists; a failed delete is logged. */
  private static void deleteCacheFile(String cacheFileName) {
    File file = new File(cacheFileName);
    if (file.exists() && !file.delete()) {
      log.warn("Unable to delete cache file " + cacheFileName);
    }
  }

}
